library(SummarizedExperiment)
library(RColorBrewer)
library(knitr)
library(dplyr)
library(parallel)
library(ggplot2)
library(DESeq2)
library(wordcloud)
library(ComplexHeatmap)
library(circlize)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
## Warning: package 'GenomicFeatures' was built under R version 3.5.3
library(goseq)
library(stringr)
library(UpSetR)

Read the processed scRNA-seq data and find the DE genes between C1 (MEF) and C7 (Flk1+ cells at D7 of reprogramming)

se <- readRDS(gzcon(url('https://s3.msi.umn.edu/garry_projects/etv2_pioneer/processed_Etv2_scRNAseq.rds')))
n_cluster <- 7
set.seed(1); cls <- kmeans(colData(se)$umap, n_cluster)$cluster
cls <- as.numeric(factor(cls, c(1, 5, 4, 2, 3, 6, 7)))
colData(se)$cluster <- cls
X <- assays(se)$normalized_counts %>% as.matrix()
diff_C7_C1 <- Matrix::rowMeans(X[, cls == 7]) - Matrix::rowMeans(X[, cls == 1])
pvalue_C7_C1 <- unlist(mclapply(1:nrow(X), function(n) tryCatch(wilcox.test(X[n, cls == 7], X[n, cls == 1])$p.value, error = function(e) NA), mc.cores = 4))

Load the bulk RNA-seq of Etv2 induction in ES/EB.

se_rna <- readRDS(gzcon(url('https://s3.msi.umn.edu/gongx030/datasets/dataset=Etv2RNA-seq_version=20190909a/se.rds')))
se_rna <- DESeqDataSet(se_rna, design = ~ group)
## Warning in DESeqDataSet(se_rna, design = ~group): some variables in design
## formula are characters, converting to factors
se_rna <- estimateSizeFactors(se_rna)
se_rna <- DESeq(se_rna)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
assays(se_rna)$normalized_counts <- log2(counts(se_rna, normalized = TRUE) + 1)
res <- results(se_rna, contrast = c('group', 'EB_Dox_D25_Flk1pos_Etv2', 'EB_NoDox_D25_Etv2'))

Merge the scRNA-seq and bulk RNA-seq by gene symbols.

y <- merge(
  data.frame(symbol = rownames(res), EB_log2FoldChange = res$log2FoldChange, EB_pvalue = res$pvalue), 
  data.frame(symbol = rowData(se)$name, MEF_log2FoldChange = diff_C7_C1, MEF_pvalue = pvalue_C7_C1), 
  by.x = 'symbol', 
  by.y = 'symbol'
)
y <- y[!is.na(y$EB_pvalue) & !is.na(y$MEF_pvalue), ]
y <- y[!duplicated(y$symbol), ]
rownames(y) <- y$symbol

Look at the genes that are consistently changed in ES/EB and MEF

n <- y$EB_pvalue < 1e-3
up_MEF <- y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange > 0 &  y$EB_log2FoldChange > 0
down_MEF <- y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange < 0 & y$EB_log2FoldChange < 0
bg <- rep('black', nrow(y))
names(bg) <- rownames(y)
bg[up_MEF] <- 'red'
bg[down_MEF] <- 'blue'
lm_model <- lm(MEF_log2FoldChange ~ EB_log2FoldChange - 1, y[n, ])
plot(MEF_log2FoldChange ~ EB_log2FoldChange, y[n, ], cex = 0.1, xlab = 'ES/EB', ylab = 'MEF', pch = 21, bg = bg[n], col = bg[n], main = sprintf('p=%.3e', coef(summary(lm_model))[1, 4]), xpd = TRUE)
abline(v = 0, h = 0, lty = 2)
lm_model <- lm(MEF_log2FoldChange ~ EB_log2FoldChange - 1, y[n, ])
abline(lm_model, lwd = 2, lty = 2, col = 'black')
gs <- c(
  'Etv2', 'Igfbp4', 'Kdr', 'Lmo2',  
  'Ets1', 'Sox18', 'Cdh5', 'Rhoj', 'Vim',
  'Elk3', 'Cd44', 'Tek', 'Gata2', 
  'Eng', 'Itga2b', 'Mest',
  'Egr1', 'Bcat1', 'Myc'
)
wordcloud::textplot(y[gs, 'EB_log2FoldChange'], y[gs, 'MEF_log2FoldChange'], gs, cex = 1.5, new = FALSE,col = bg[gs])

A heatmap of combined RNA-seq data

# genes up-regulated in both MEF and EB
up_up <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange > log2(2) & y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange > 0.1] 
down_down <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange < log2(1/2) & y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange < -0.1] 
gs <- c(up_up, down_down)
row_vector <- rep(1:2, c(length(up_up), length(down_down)))
# genes to highlight in the heatmap
genes <- c(
  'Etv2', 'Igfbp4', 'Kdr', 'Lmo2',  
  'Ets1', 'Sox18', 'Cdh5', 'Rhoj', 'Vim',
  'Elk3', 'Cd44', 'Tek', 'Gata2', 
  'Eng', 'Itga2b', 'Mest',
  'Egr1', 'Kras',
  'Emcn', 'Nrp1',
  'Prdm1', 'Srf', 'Yy1', 'Yes1', 'Amotl1', 'Tgfbr2', 'Myc', 'Ccnd1', 'Vav3'
)
genes <- genes[genes %in% gs]
col_fun <- colorRamp2(c(-3, 0, 3), c("blue", "white", "red"))
X_sc <- assays(se)$scaled_counts[, cls %in% c(1, 7)] %>% as.matrix()
col_group_sc <- colData(se)$group[cls %in% c(1, 7)]
rownames(X_sc) <- rowData(se)$name
group2bg <- c(
   'MEF_Dox_D1' = 'black',
   'MEF_NoDox' = 'blue',
   'MEF_Dox_D2' = 'purple',
   'MEF_Dox_D7a' = 'red',
   'MEF_Dox_D7b' = 'pink'
 )
cluster2bg <- colorRampPalette(brewer.pal(11,'Spectral'))(n_cluster)
names(cluster2bg) <- sprintf('C%d', 1:n_cluster)
column_annotation <- HeatmapAnnotation(
  group = colData(se)$group[cls %in% c(1, 7)], 
  cluster = sprintf('C%d', colData(se)$cluster[cls %in% c(1, 7)]),
  col = list(group = group2bg, cluster = cluster2bg)
)
Heatmap(X_sc[gs, ], row_split = row_vector, cluster_rows = FALSE, cluster_columns = FALSE, 
        show_row_names = FALSE, 
        col = col_fun, 
        show_row_dend = FALSE,
        top_annotation = column_annotation,
        )

X_b <- t(scale(t(log2(assays(se_rna)$normalized_counts + 1))))
rownames(X_b) <- rowData(se_rna)$refseq_mrna
group2stage <- c(
  'EB_Dox_D2_Etv2' = 'D2',
  'EB_Dox_D25_Etv2' = 'D2_5',
  'EB_Dox_D25_Flk1pos_Etv2' = 'D2_5_Flk1',
  'EB_NoDox_D2_Etv2' = 'D2',
  'EB_NoDox_D25_Etv2' = 'D2_5'
)
stage <- group2stage[colData(se_rna)$group]
group2dox <- c(
  'EB_Dox_D2_Etv2' = 'Dox',
  'EB_Dox_D25_Etv2' = 'Dox',
  'EB_Dox_D25_Flk1pos_Etv2' = 'Dox',
  'EB_NoDox_D2_Etv2' = 'NoDox',
  'EB_NoDox_D25_Etv2' = 'NoDox'
)
dox <- group2dox[colData(se_rna)$group]
column_annotation <- HeatmapAnnotation(
  stage = stage, 
  dox = dox,
  col = list(stage = c('D2' = 'green', 'D2_5' = 'gold', 'D2_5_Flk1' = 'pink'), dox = c('Dox' = 'black', 'NoDox' = 'white'))
)
mark_col <- rep('red', length(gs))
names(mark_col) <- gs
mark_col[gs %in% down_down] <- 'blue'
mark_at <- which(gs %in% genes)
row_annotation <- rowAnnotation(
  up_genes = anno_mark(
    at = mark_at, 
    labels = gs[mark_at], 
    labels_gp = gpar(fontsize = 15, col = mark_col[mark_at]),
    padding = unit(1, "mm")
  )
)
Heatmap(X_b[gs, ], row_split = row_vector, cluster_rows = FALSE, cluster_columns = TRUE, 
        show_row_names = FALSE, 
        col = col_fun, 
        show_row_dend = FALSE,
        show_column_dend = FALSE,
        top_annotation = column_annotation,
        right_annotation = row_annotation
)

Pathway analysis of commonly up-regulated genes, commonly down-regulated genes and up-regulated in ES/EB but not in MEF

Create a list of signifiant genes up-regulated in each cluster, fit the Probability Weighting Function (PWF) and find the significantly enriched GO terms.

up_up <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange > log2(2) & y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange > 0.1] 
down_down <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange < log2(1/2) & y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange < -0.1] 
up_nc <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange > log2(2) & !(y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange > 0.1)]
down_nc <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange < log2(1/2) & !(y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange < -0.1)]

go_res <- lapply(list(up_up, down_down, up_nc, down_nc), function(g){
   x <- rep(0, nrow(y))
   names(x) <- rownames(y)
   x[g] <- 1
   pwf <- nullp(x, "mm10", "geneSymbol", plot.fit = FALSE)
   goseq(pwf, "mm10","geneSymbol", test.cats = c("GO:BP"))
})
h <- 3
  go_res[[h]] %>% 
  filter(numInCat < 1000) %>%
  top_n(10, wt = -over_represented_pvalue) %>% 
  mutate(hitsPerc = numDEInCat * 100 / numInCat, term = str_wrap(term, 30)) %>% 
  ggplot(aes(x = hitsPerc, y = term, colour = over_represented_pvalue, size = numDEInCat)) +
  geom_point() +
  labs(title = sprintf('cluster %d', h), x="Hits (%)", y="", colour="p value", size="Count") +
  theme(axis.text = element_text(size = 15, face = 'bold')) + 
  theme(legend.text = element_text(size = 15, face = 'bold')) + 
  theme(legend.title = element_text(size = 15, face = 'bold'))

Make venn diagram showing the overlap between up- and down- regulated genes between MEF and ES/EB

up_in_EB <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange > log2(2)]
up_in_MEF <- rownames(y)[y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange > 0.1] 
down_in_EB <- rownames(y)[y$EB_pvalue < 1e-3 & y$EB_log2FoldChange < -log2(2)] 
down_in_MEF <- rownames(y)[y$MEF_pvalue < 1e-3 & y$MEF_log2FoldChange < -0.1] 
listInput <- list('up in EB' = up_in_EB, 'up in MEF' = up_in_MEF, 'down in EB' = down_in_EB, 'down in MEF' = down_in_MEF)
upset(fromList(listInput), order.by = "freq", point.size = 3.5, line.size = 2, text.scale = c(1, 2, 1, 1, 1, 2))

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] org.Mm.eg.db_3.7.0                      
##  [2] UpSetR_1.4.0                            
##  [3] stringr_1.4.0                           
##  [4] goseq_1.34.1                            
##  [5] geneLenDataBase_1.18.0                  
##  [6] BiasedUrn_1.07                          
##  [7] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.4
##  [8] GenomicFeatures_1.34.8                  
##  [9] AnnotationDbi_1.44.0                    
## [10] circlize_0.4.8                          
## [11] ComplexHeatmap_2.3.1                    
## [12] wordcloud_2.6                           
## [13] DESeq2_1.22.2                           
## [14] ggplot2_3.2.1                           
## [15] dplyr_0.8.3                             
## [16] knitr_1.26                              
## [17] RColorBrewer_1.1-2                      
## [18] SummarizedExperiment_1.12.0             
## [19] DelayedArray_0.8.0                      
## [20] BiocParallel_1.16.6                     
## [21] matrixStats_0.55.0                      
## [22] Biobase_2.42.0                          
## [23] GenomicRanges_1.34.0                    
## [24] GenomeInfoDb_1.18.2                     
## [25] IRanges_2.16.0                          
## [26] S4Vectors_0.20.1                        
## [27] BiocGenerics_0.28.0                     
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1         rjson_0.2.20             htmlTable_1.13.3        
##  [4] XVector_0.22.0           GlobalOptions_0.1.1      base64enc_0.1-3         
##  [7] clue_0.3-57              rstudioapi_0.10          farver_2.0.1            
## [10] bit64_0.9-7              splines_3.5.2            geneplotter_1.60.0      
## [13] zeallot_0.1.0            Formula_1.2-3            Rsamtools_1.34.1        
## [16] annotate_1.60.1          cluster_2.1.0            GO.db_3.7.0             
## [19] png_0.1-7                compiler_3.5.2           httr_1.4.1              
## [22] backports_1.1.5          assertthat_0.2.1         Matrix_1.2-18           
## [25] lazyeval_0.2.2           acepack_1.4.1            htmltools_0.4.0         
## [28] prettyunits_1.0.2        tools_3.5.2              gtable_0.3.0            
## [31] glue_1.3.1               GenomeInfoDbData_1.2.0   Rcpp_1.0.3              
## [34] vctrs_0.2.0              Biostrings_2.50.2        nlme_3.1-143            
## [37] rtracklayer_1.42.2       xfun_0.11                lifecycle_0.1.0         
## [40] XML_3.98-1.20            zlibbioc_1.28.0          scales_1.1.0            
## [43] hms_0.5.2                yaml_2.2.0               memoise_1.1.0           
## [46] gridExtra_2.3            biomaRt_2.38.0           rpart_4.1-15            
## [49] latticeExtra_0.6-28      stringi_1.4.3            RSQLite_2.1.4           
## [52] genefilter_1.64.0        checkmate_1.9.4          shape_1.4.4             
## [55] rlang_0.4.2              pkgconfig_2.0.3          bitops_1.0-6            
## [58] evaluate_0.14            lattice_0.20-38          purrr_0.3.3             
## [61] labeling_0.3             GenomicAlignments_1.18.1 htmlwidgets_1.5.1       
## [64] bit_1.1-14               tidyselect_0.2.5         plyr_1.8.5              
## [67] magrittr_1.5             R6_2.4.1                 Hmisc_4.3-0             
## [70] DBI_1.0.0                pillar_1.4.2             foreign_0.8-72          
## [73] withr_2.1.2              mgcv_1.8-31              survival_3.1-8          
## [76] RCurl_1.95-4.12          nnet_7.3-12              tibble_2.1.3            
## [79] crayon_1.3.4             rmarkdown_1.18           GetoptLong_0.1.7        
## [82] progress_1.2.2           locfit_1.5-9.1           data.table_1.12.2       
## [85] blob_1.2.0               digest_0.6.23            xtable_1.8-4            
## [88] munsell_0.5.0